Setup
knitr::opts_chunk$set(echo = TRUE)
Packages
pkgs <- c("tidyverse", "kableExtra", "plotly")
invisible(lapply(pkgs, library, character.only = T))
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## Warning: package 'kableExtra' was built under R version 4.2.3
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
## Warning: package 'plotly' was built under R version 4.2.3
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
Paths
root <- "D:/Documents/DegreesNYC/Degrees_data/Project Examples"
indir <- file.path(root, "Data")
intdir <- file.path(root, "Intermediate")
outdir <- file.path(root, "Output")
Project
Load Data
# data <- read.csv(file.path(indir, "mock_data.csv"))
data <- readxl::read_xlsx(file.path(intdir, "nyc_student_survey_2022.xlsx"), sheet = "Total")
data
## # A tibble: 1,127 × 11
## DBN Schoo…¹ Total…² Total…³ Total…⁴ Colla…⁵ Effec…⁶ Rigor…⁷ Suppo…⁸ Stron…⁹
## <chr> <chr> <dbl> <dbl> <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 01M0… P.S. 0… 0.15 0.694 0.71 N/A N/A N/A N/A N/A
## 2 01M1… P.S. 1… 0.3 0.0552 0.76 N/A N/A N/A N/A N/A
## 3 01M1… P.S. 1… 0.69 0.982 0.93 N/A N/A N/A N/A N/A
## 4 01M1… P.S. 1… 0.65 0.689 0.8 N/A N/A N/A N/A N/A
## 5 01M2… Orchar… 0.34 0.514 0.92 N/A N/A N/A N/A N/A
## 6 01M3… Univer… 0.64 0.840 0.93 N/A N/A N/A N/A N/A
## 7 01M3… School… 0.05 0.383 0.45 N/A N/A N/A N/A N/A
## 8 01M4… Univer… 0.38 0.794 1 N/A N/A N/A N/A N/A
## 9 01M4… East S… 0.68 0.782 0.68 N/A N/A N/A N/A N/A
## 10 01M4… Forsyt… 0.16 0.330 0.73 N/A N/A N/A N/A N/A
## # … with 1,117 more rows, 1 more variable: `Trust Score` <chr>, and abbreviated
## # variable names ¹`School Name`, ²`Total Parent Response Rate`,
## # ³`Total Student Response Rate`, ⁴`Total Teacher Response Rate`,
## # ⁵`Collaborative Teachers Score`, ⁶`Effective School Leadership Score`,
## # ⁷`Rigorous Instruction Score`, ⁸`Supportive Environment Score`,
## # ⁹`Strong Family-Community Ties Score`
clean Data
data %>%
# Keep only Manhattan schools
filter(grepl("\\d{2}M\\d{3}", DBN)) %>%
select(-c(ends_with("Score"))) %>%
# Extract district
mutate(district = substr(DBN, 1, 2)) %>%
{data <<- .}
Descriptive Statistics
dstats <- function(x, y) {
name <- str_remove_all(y, "\\s")
mean_name <- paste0(name, "_mean")
min_name <- paste0(name, "_min")
max_name <- paste0(name, "_max")
x %>%
group_by(district) %>%
summarize(mean_name = mean(!!sym(y)),
min_name = min(!!sym(y)),
max_name = max(!!sym(y))) %>%
setNames(c("District", mean_name, min_name, max_name))
}
map_dfc(c("Total Parent Response Rate", "Total Student Response Rate", "Total Teacher Response Rate"), function(x) dstats(data, x)) %>%
select(-matches("District...[^1]")) %>%
rename(District = `District...1`) %>%
kable() %>%
kable_paper()
## New names:
## • `District` -> `District...1`
## • `District` -> `District...5`
## • `District` -> `District...9`
|
District
|
TotalParentResponseRate_mean
|
TotalParentResponseRate_min
|
TotalParentResponseRate_max
|
TotalStudentResponseRate_mean
|
TotalStudentResponseRate_min
|
TotalStudentResponseRate_max
|
TotalTeacherResponseRate_mean
|
TotalTeacherResponseRate_min
|
TotalTeacherResponseRate_max
|
|
01
|
0.3713333
|
0.04
|
0.69
|
0.6025339
|
0.0552147
|
0.9821429
|
0.7800000
|
0.45
|
1.00
|
|
02
|
0.2993182
|
0.00
|
0.91
|
0.6565823
|
0.0062241
|
0.9646018
|
0.7617045
|
0.00
|
1.00
|
|
03
|
0.3706452
|
0.04
|
1.00
|
0.6445893
|
0.0100000
|
1.0000000
|
0.7383871
|
0.00
|
1.00
|
|
04
|
0.3663158
|
0.00
|
0.92
|
0.7724843
|
0.2476190
|
0.9813505
|
0.7705263
|
0.34
|
1.00
|
|
05
|
0.2929412
|
0.06
|
0.84
|
0.5669682
|
0.1597222
|
1.0000000
|
0.6382353
|
0.31
|
1.00
|
|
06
|
0.6314815
|
0.04
|
1.00
|
0.7377922
|
0.0143885
|
1.0000000
|
0.7570370
|
0.29
|
1.00
|
|
75
|
0.3677778
|
0.15
|
1.00
|
0.5494369
|
0.2500000
|
0.9757576
|
0.5177778
|
0.22
|
0.78
|
|
84
|
0.3497619
|
0.00
|
0.97
|
0.3642449
|
0.0000000
|
0.9971671
|
0.4785714
|
0.00
|
1.00
|
data_dist <- map_dfc(c("Total Parent Response Rate", "Total Student Response Rate", "Total Teacher Response Rate"), function(x) dstats(data, x)) %>%
select(-matches("District...[^1]")) %>%
rename(District = `District...1`)
## New names:
## • `District` -> `District...1`
## • `District` -> `District...5`
## • `District` -> `District...9`
Visualizations
data_dist %>%
rename_with(~ str_to_title(gsub("(Total)(\\w+)(Response)(Rate)_(\\w+)$", "\\1 \\2 \\3 \\4 \\5", .x, perl = T)), starts_with("Total")) %>%
select(District, `Total Student Response Rate Mean`) %>%
ggplot(aes(x = District, y = `Total Student Response Rate Mean`*100, fill = District, label = round(`Total Student Response Rate Mean`*100, 2))) +
geom_bar(stat = "identity") +
geom_text(vjust= -.5) +
scale_y_continuous(limits = c(0, 100)) +
ggtitle("Mean Student Response Rate by District") +
ylab("Response Rate") +
theme(axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank())

heatmap <- data_dist %>%
group_by(District) %>%
rename_with(~ str_to_title(gsub("(Total)(\\w+)(Response)(Rate)_(\\w+)$", "\\1 \\2 \\3 \\4 \\5", .x, perl = T)), starts_with("Total")) %>%
select(District, ends_with("Mean")) %>%
pivot_longer(ends_with("Mean"), names_to = "Outcome", values_to = "Value") %>%
ggplot(aes(x = District, y = Outcome, fill = Value)) +
geom_tile() +
ggtitle("Mean Response Rate by District")
ggplotly(heatmap)